BINF-F401 - Quantifying human sexual dimorphism with the dimorphism index

Authors

Youmna Ayadi

Kevin Straatman

Published

May 23, 2025

1 Importing and pre-processing the data

Code
RNA_counts_adjusted <- read.csv("data/RNA_counts_adjusted.csv", 
                                header = TRUE, row.names = 1)

covariates_original <- read.csv("data/covariates.csv", header = TRUE)
Code
head(RNA_counts_adjusted)
GTEX.1117F GTEX.111CU GTEX.111FC GTEX.111VG GTEX.111YS GTEX.1122O GTEX.1128S GTEX.113IC GTEX.117YX GTEX.11DXW GTEX.11DXX GTEX.11DZ1 GTEX.11EI6 GTEX.11EM3 GTEX.11EMC GTEX.11EQ8 GTEX.11EQ9 GTEX.11GS4 GTEX.11GSO GTEX.11H98 GTEX.11I78 GTEX.11LCK GTEX.11NUK GTEX.11O72 GTEX.11OF3 GTEX.11ONC GTEX.11P7K GTEX.11P81 GTEX.11P82 GTEX.11PRG GTEX.11TT1 GTEX.11TTK GTEX.11TUW GTEX.11UD1 GTEX.11UD2 GTEX.11WQC GTEX.11XUK GTEX.11ZTS GTEX.11ZTT GTEX.11ZU8 GTEX.11ZUS GTEX.11ZVC GTEX.1211K GTEX.12126 GTEX.1212Z GTEX.12584 GTEX.12696 GTEX.1269C GTEX.12C56 GTEX.12KS4 GTEX.12WSA GTEX.12WSC GTEX.12WSD GTEX.12WSL GTEX.12WSM GTEX.12ZZX GTEX.12ZZY GTEX.12ZZZ GTEX.13111 GTEX.13112 GTEX.13113 GTEX.1313W GTEX.1314G GTEX.131XE GTEX.131XF GTEX.131XW GTEX.131YS GTEX.132AR GTEX.132QS GTEX.133LE GTEX.1399R GTEX.1399T GTEX.1399U GTEX.139D8 GTEX.139T4 GTEX.139T6 GTEX.139T8 GTEX.139TT GTEX.139TU GTEX.139YR GTEX.13CF2 GTEX.13CF3 GTEX.13D11 GTEX.13FH7 GTEX.13FHO GTEX.13FHP GTEX.13FTW GTEX.13FTY GTEX.13G51 GTEX.13IVO GTEX.13JUV GTEX.13JVG GTEX.13N11 GTEX.13N1W GTEX.13N2G GTEX.13NYB GTEX.13NYS GTEX.13NZ8 GTEX.13NZ9 GTEX.13NZA GTEX.13NZB GTEX.13O1R GTEX.13O21 GTEX.13O3P GTEX.13O3Q GTEX.13O61 GTEX.13OVI GTEX.13OVJ GTEX.13OVK GTEX.13OVL GTEX.13OW6 GTEX.13OW7 GTEX.13OW8 GTEX.13PDP GTEX.13PLJ GTEX.13PVQ GTEX.13PVR GTEX.13QBU GTEX.13QJ3 GTEX.13RTJ GTEX.13S7M GTEX.13SLW GTEX.13SLX GTEX.13U4I GTEX.13VXT GTEX.13VXU GTEX.13W3W GTEX.13X6J GTEX.13X6K GTEX.13YAN GTEX.144FL GTEX.144GL GTEX.144GM GTEX.144GO GTEX.145LS GTEX.145LT GTEX.145ME GTEX.145MF GTEX.145MG GTEX.145MI GTEX.145MN GTEX.145MO GTEX.146FH GTEX.146FQ GTEX.146FR GTEX.14753 GTEX.147F3 GTEX.147GR GTEX.147JS GTEX.148VI GTEX.1497J GTEX.14A5H GTEX.14ABY GTEX.14AS3 GTEX.14BIL GTEX.14BIN GTEX.14BMU GTEX.14BMV GTEX.14C39 GTEX.14C5O GTEX.14DAQ GTEX.14DAR GTEX.14E1K GTEX.14E6C GTEX.14E6D GTEX.14E6E GTEX.14H4A GTEX.14ICK GTEX.14ICL GTEX.14JG1 GTEX.14JG6 GTEX.14JIY GTEX.14LLW GTEX.14PHX GTEX.14PII GTEX.14PJ2 GTEX.14PJ3 GTEX.14PJ4 GTEX.14PJ6 GTEX.14PJM GTEX.14PK6 GTEX.14PN3 GTEX.15CHQ GTEX.15DCD GTEX.15DDE GTEX.15ER7 GTEX.15ETS GTEX.15G19 GTEX.15RIE GTEX.15SDE GTEX.15UKP GTEX.169BO GTEX.16AAH GTEX.16BQI GTEX.16GPK GTEX.16MT8 GTEX.16MTA GTEX.16NGA GTEX.16NPX GTEX.16XZY GTEX.16XZZ GTEX.16YQH GTEX.17EUY GTEX.17EVP GTEX.17F96 GTEX.17F97 GTEX.17F98 GTEX.17F9Y GTEX.17HG3 GTEX.17HGU GTEX.17HHE GTEX.17HHY GTEX.17HII GTEX.17KNJ GTEX.17MF6 GTEX.17MFQ GTEX.183FY GTEX.183WM GTEX.18465 GTEX.18A66 GTEX.18A67 GTEX.18A6Q GTEX.18A7A GTEX.18D9A GTEX.18D9U GTEX.18QFQ GTEX.1A32A GTEX.1A3MV GTEX.1A3MW GTEX.1A3MX GTEX.1A8FM GTEX.1A8G6 GTEX.1A8G7 GTEX.1AMEY GTEX.1AMFI GTEX.1AX9I GTEX.1AX9J GTEX.1AX9K GTEX.1AYCT GTEX.1AYD5 GTEX.1B8KE GTEX.1B8KZ GTEX.1B8L1 GTEX.1B8SF GTEX.1B8SG GTEX.1B932 GTEX.1B933 GTEX.1B97I GTEX.1B996 GTEX.1BAJH GTEX.1C2JI GTEX.1C475 GTEX.1C4CL GTEX.1C64O GTEX.1C6VQ GTEX.1C6VR GTEX.1C6VS GTEX.1C6WA GTEX.1CAMQ GTEX.1CAMR GTEX.1CAMS GTEX.1CB4E GTEX.1CB4F GTEX.1CB4G GTEX.1CB4H GTEX.1CB4I GTEX.1CB4J GTEX.1E1VI GTEX.1E2YA GTEX.1EH9U GTEX.1EKGG GTEX.1EMGI GTEX.1EU9M GTEX.1EWIQ GTEX.1EX96 GTEX.1F48J GTEX.1F52S GTEX.1F5PK GTEX.1F6I4 GTEX.1F6RS GTEX.1F75A GTEX.1F75I GTEX.1F75W GTEX.1F88E GTEX.1F88F GTEX.1FIGZ GTEX.1GF9V GTEX.1GF9X GTEX.1GMR2 GTEX.1GMR3 GTEX.1GMR8 GTEX.1GMRU GTEX.1GN1U GTEX.1GN1V GTEX.1GN1W GTEX.1GN2E GTEX.1GN73 GTEX.1GPI7 GTEX.1GTWX GTEX.1GZ2Q GTEX.1GZ4H GTEX.1GZ4I GTEX.1H1CY GTEX.1H1DE GTEX.1H1DF GTEX.1H1E6 GTEX.1H1ZS GTEX.1H23P GTEX.1H2FU GTEX.1H3O1 GTEX.1H3VE GTEX.1H3VY GTEX.1H4P4 GTEX.1HB9E GTEX.1HBPH GTEX.1HBPI GTEX.1HBPM GTEX.1HCU6 GTEX.1HCU7 GTEX.1HCU8 GTEX.1HCUA GTEX.1HCVE GTEX.1HFI6 GTEX.1HFI7 GTEX.1HGF4 GTEX.1HR98 GTEX.1HR9M GTEX.1HSEH GTEX.1HSGN GTEX.1HSKV GTEX.1HSMO GTEX.1HSMP GTEX.1HSMQ GTEX.1HT8W GTEX.1HUB1 GTEX.1I19N GTEX.1I1CD GTEX.1I1GP GTEX.1I1GQ GTEX.1I1GR GTEX.1I1GS GTEX.1I1GT GTEX.1I1GU GTEX.1I1GV GTEX.1I1HK GTEX.1I4MK GTEX.1I6K6 GTEX.1I6K7 GTEX.1ICG6 GTEX.1ICLY GTEX.1ICLZ GTEX.1IDFM GTEX.1IDJD GTEX.1IDJF GTEX.1IDJU GTEX.1IE54 GTEX.1IGQW GTEX.1IKJJ GTEX.1IKK5 GTEX.1IL2U GTEX.1IL2V GTEX.1IOXB GTEX.1IY9M GTEX.1J1OQ GTEX.1J1R8 GTEX.1J8EW GTEX.1J8JJ GTEX.1J8Q3 GTEX.1J8QM GTEX.1JJ6O GTEX.1JJE9 GTEX.1JJEA GTEX.1JK1U GTEX.1JKYN GTEX.1JKYR GTEX.1JMLX GTEX.1JMPY GTEX.1JMQI GTEX.1JMQK GTEX.1JMQL GTEX.1JN1M GTEX.1JN6P GTEX.1JN76 GTEX.1K2DA GTEX.1K9T9 GTEX.1KAFJ GTEX.1KANA GTEX.1KD5A GTEX.1KWVE GTEX.1KXAM GTEX.1L5NE GTEX.1LB8K GTEX.1LBAC GTEX.1LG7Y GTEX.1LG7Z GTEX.1LGRB GTEX.1LH75 GTEX.1LSNL GTEX.1LSNM GTEX.1LSVX GTEX.1LVA9 GTEX.1LVAN GTEX.1LVAO GTEX.1M5QR GTEX.1MA7W GTEX.1MA7X GTEX.1MCC2 GTEX.1MCYP GTEX.1MGNQ GTEX.1MJK2 GTEX.1MUQO GTEX.1N2DW GTEX.1N2EF GTEX.1NHNU GTEX.1OFPY GTEX.1OJC4 GTEX.1PBJI GTEX.1PDJ9 GTEX.1PFEY GTEX.1PIGE GTEX.1PIIG GTEX.1POEN GTEX.1PPGY GTEX.1PPH8 GTEX.1PWST GTEX.1QAET GTEX.1QCLY GTEX.1QEPI GTEX.1QP28 GTEX.1QP29 GTEX.1QP2A GTEX.1QP66 GTEX.1QP67 GTEX.1QP6S GTEX.1QP9N GTEX.1QPFJ GTEX.1QW4Y GTEX.1R46S GTEX.1R7EU GTEX.1R7EV GTEX.1R9K5 GTEX.1R9PM GTEX.1RAZA GTEX.1RAZQ GTEX.1RAZR GTEX.1RDX4 GTEX.1S5ZU GTEX.1S82P GTEX.1S831 GTEX.1S83E GTEX.N7MS GTEX.NFK9 GTEX.NPJ8 GTEX.O5YT GTEX.O5YV GTEX.OHPK GTEX.OHPM GTEX.OHPN GTEX.OIZF GTEX.OIZG GTEX.OIZH GTEX.OIZI GTEX.OOBJ GTEX.OOBK GTEX.OXRK GTEX.OXRL GTEX.OXRN GTEX.OXRO GTEX.OXRP GTEX.P44G GTEX.P44H GTEX.P4PP GTEX.P4PQ GTEX.P4QS GTEX.P4QT GTEX.P78B GTEX.PLZ4 GTEX.PLZ5 GTEX.PLZ6 GTEX.POMQ GTEX.POYW GTEX.PSDG GTEX.PWCY GTEX.PWN1 GTEX.PX3G GTEX.Q2AG GTEX.Q2AH GTEX.Q2AI GTEX.Q734 GTEX.QCQG GTEX.QDT8 GTEX.QDVJ GTEX.QDVN GTEX.QEG4 GTEX.QEG5 GTEX.QEL4 GTEX.QESD GTEX.QLQ7 GTEX.QLQW GTEX.QMRM GTEX.QV31 GTEX.QV44 GTEX.R3RS GTEX.R45C GTEX.R53T GTEX.R55C GTEX.R55D GTEX.R55G GTEX.REY6 GTEX.RM2N GTEX.RNOR GTEX.RTLS GTEX.RU72 GTEX.RUSQ GTEX.RVPV GTEX.RWSA GTEX.S32W GTEX.S33H GTEX.S4P3 GTEX.S7SE GTEX.S95S GTEX.SIU8 GTEX.SJXC GTEX.SN8G GTEX.SNMC GTEX.SNOS GTEX.SSA3 GTEX.SUCS GTEX.T2IS GTEX.T5JC GTEX.T5JW GTEX.T6MN GTEX.T6MO GTEX.TKQ1 GTEX.TML8 GTEX.TMZS GTEX.TSE9 GTEX.U3ZM GTEX.U3ZN GTEX.U412 GTEX.U4B1 GTEX.U8XE GTEX.UJHI GTEX.UTHO GTEX.VJWN GTEX.VJYA GTEX.VUSG GTEX.VUSH GTEX.W5WG GTEX.W5X1 GTEX.WEY5 GTEX.WFG7 GTEX.WFG8 GTEX.WFJO GTEX.WFON GTEX.WH7G GTEX.WHPG GTEX.WI4N GTEX.WL46 GTEX.WVJS GTEX.WVLH GTEX.WY7C GTEX.WYJK GTEX.X15G GTEX.X261 GTEX.X4EO GTEX.X4LF GTEX.X4XY GTEX.X5EB GTEX.X638 GTEX.X88G GTEX.X8HC GTEX.XAJ8 GTEX.XBEC GTEX.XBED GTEX.XBEW GTEX.XGQ4 GTEX.XK95 GTEX.XLM4 GTEX.XMD2 GTEX.XMK1 GTEX.XOT4 GTEX.XPVG GTEX.XQ3S GTEX.XQ8I GTEX.XUW1 GTEX.XUYS GTEX.XUZC GTEX.XV7Q GTEX.XXEK GTEX.XYKS GTEX.Y111 GTEX.Y114 GTEX.Y3I4 GTEX.Y3IK GTEX.Y5V5 GTEX.Y5V6 GTEX.Y8E4 GTEX.Y8E5 GTEX.Y8LW GTEX.Y9LG GTEX.YB5E GTEX.YB5K GTEX.YEC3 GTEX.YEC4 GTEX.YECK GTEX.YF7O GTEX.YFC4 GTEX.YFCO GTEX.YJ89 GTEX.YJ8A GTEX.YJ8O GTEX.Z93S GTEX.Z93T GTEX.Z9EW GTEX.ZAJG GTEX.ZAK1 GTEX.ZC5H GTEX.ZDTS GTEX.ZDTT GTEX.ZDXO GTEX.ZDYS GTEX.ZE7O GTEX.ZE9C GTEX.ZEX8 GTEX.ZF28 GTEX.ZF29 GTEX.ZF3C GTEX.ZGAY GTEX.ZLFU GTEX.ZLV1 GTEX.ZLWG GTEX.ZP4G GTEX.ZPCL GTEX.ZPIC GTEX.ZPU1 GTEX.ZQG8 GTEX.ZT9X GTEX.ZTPG GTEX.ZTSS GTEX.ZTX8 GTEX.ZUA1 GTEX.ZV68 GTEX.ZVE2 GTEX.ZVP2 GTEX.ZVT2 GTEX.ZVT4 GTEX.ZVTK GTEX.ZVZO GTEX.ZVZP GTEX.ZXES GTEX.ZXG5 GTEX.ZYFC GTEX.ZYFD GTEX.ZYT6 GTEX.ZYVF GTEX.ZYW4 GTEX.ZYY3 GTEX.ZZ64 GTEX.ZZPU
WASH7P 169 81 108 89 89 68 102 110 70 80 66 61 123 106 157 106 120 116 46 105 101 68 55 123 163 86 224 112 115 126 54 159 61 218 144 132 79 100 89 71 90 87 85 155 99 102 102 95 63 5 149 163 167 56 164 115 74 77 150 68 118 99 125 170 145 97 171 70 86 76 108 127 89 143 101 107 86 102 72 58 121 87 104 83 82 176 111 96 138 144 178 97 130 100 163 164 170 141 94 110 126 68 168 106 124 138 139 123 133 65 74 72 102 138 152 138 71 83 109 112 155 61 108 125 160 98 71 92 106 79 102 124 116 80 82 84 120 84 119 35 134 133 113 82 121 159 84 117 111 40 147 176 197 131 129 154 96 90 153 128 70 90 175 63 84 48 117 72 161 62 54 117 137 106 135 116 108 79 136 103 93 85 100 143 166 120 76 115 66 203 66 95 44 100 74 89 90 89 107 134 59 81 42 100 86 82 53 98 143 125 119 120 105 101 76 133 78 85 87 36 86 84 109 152 125 111 99 94 132 117 110 104 102 156 104 56 80 115 106 210 68 130 130 90 151 56 76 126 53 163 113 81 96 166 80 83 89 90 135 37 0 107 142 111 123 106 103 128 44 134 146 160 43 111 71 59 110 130 41 178 77 174 147 181 85 63 126 94 140 73 112 95 135 88 98 142 93 79 117 78 197 102 139 47 11 95 59 102 113 103 125 163 122 64 99 135 117 106 77 59 123 153 63 123 137 82 139 95 143 83 103 160 202 140 107 92 188 79 116 93 163 61 107 143 89 82 76 160 126 111 164 162 68 142 132 161 70 151 198 169 159 103 71 64 139 105 141 73 113 41 134 143 78 153 59 165 187 133 70 95 33 110 118 79 119 110 111 84 159 117 165 194 163 121 84 90 136 95 81 124 135 39 98 124 132 133 178 76 157 87 118 57 112 125 123 88 139 157 180 94 163 66 114 131 43 114 131 133 65 117 116 92 112 121 83 111 74 104 97 39 50 108 138 155 93 88 72 110 128 185 63 124 126 112 119 84 80 144 175 80 62 144 192 77 88 83 32 117 161 133 117 129 102 137 172 134 54 92 81 120 108 131 87 76 102 23 125 29 57 95 141 126 152 142 56 111 5 24 128 78 90 97 99 58 149 74 160 64 58 97 101 94 38 58 72 71 124 57 136 72 58 93 97 168 129 155 123 106 138 146 61 76 130 156 190 31 159 84 73 110 112 106 86 167 120 39 116 124 155 205 62 75 85 127 66 102 109 75 136 199 72 59 139 86 119 174 110 117 146 228 157 177 109 158 148 101 132 135 76 84 145 101 83 103 122 136 227 126 114 107 129 95 149 88 114 129 84 125 141 57 91 90 97 128 125 94 113 85 99 136 68 146 110 120 109 124 54 119 90 43 108 102 123 114 121 159 71 110 133 151 105 198 96 111 95 148 68 64 97 130 89 106 66 106 180 40 90 132 104 109 92 137 192 155 101 74 33 120 122 53
RP11.34P13.15 104 155 59 255 119 131 205 55 74 56 119 153 70 113 145 150 119 89 241 114 182 128 46 193 136 88 104 155 77 105 138 48 173 213 135 64 125 108 169 183 182 94 110 55 101 203 95 155 98 81 126 83 100 155 122 130 83 86 72 74 185 83 53 95 125 195 186 218 156 93 155 141 83 121 162 136 94 70 143 99 115 116 155 74 227 122 82 73 93 125 94 129 131 91 138 82 211 83 89 245 123 85 149 126 171 61 204 183 78 44 47 139 108 295 142 103 175 102 94 161 160 221 109 98 137 71 76 129 198 320 192 108 133 121 137 143 62 34 24 149 54 124 185 157 73 116 86 69 123 121 109 349 89 105 129 143 43 109 176 86 183 119 67 121 101 86 105 83 78 146 84 98 112 74 68 150 64 82 180 117 135 92 89 207 104 117 93 40 40 87 144 95 53 94 150 281 116 53 162 126 107 135 79 144 126 29 94 114 239 122 93 214 143 91 146 115 93 114 126 171 121 85 108 75 85 41 138 84 177 76 78 89 72 100 68 132 143 172 86 183 160 118 51 108 141 69 129 90 96 86 188 151 78 121 180 102 144 120 96 127 77 145 90 101 92 27 188 138 188 128 114 42 53 136 212 144 60 69 109 128 107 60 307 185 85 127 93 116 72 108 91 224 141 116 125 107 30 97 136 132 29 224 44 45 232 47 76 148 83 82 108 151 158 200 89 84 138 146 98 253 73 86 38 123 136 158 156 63 160 51 129 96 26 68 129 83 311 78 60 141 193 90 77 99 98 62 55 190 61 60 318 236 114 239 68 125 168 87 54 61 24 163 51 238 138 135 110 142 75 52 121 90 81 186 123 66 43 135 47 63 269 161 216 81 100 153 103 118 0 94 115 130 59 74 191 192 76 70 51 67 149 78 125 177 110 74 36 46 59 62 70 156 104 35 135 129 114 96 128 52 103 211 89 171 280 306 25 108 37 41 122 149 83 206 132 247 110 232 78 102 118 84 95 201 82 194 71 108 147 93 177 56 147 78 207 92 64 231 161 64 113 174 70 229 105 136 123 56 201 88 220 92 131 102 177 92 180 167 86 178 109 112 84 77 194 149 132 80 195 172 226 94 159 146 85 158 64 122 117 61 120 102 167 113 61 104 55 180 214 112 112 115 154 125 124 244 151 53 197 101 164 95 108 227 42 97 57 161 131 133 111 90 182 57 62 175 117 104 99 30 118 139 66 119 198 133 242 128 178 58 70 201 147 158 101 88 153 115 58 105 107 121 83 99 48 112 103 116 103 134 265 194 91 192 127 100 136 126 137 221 101 133 125 67 91 134 68 149 183 111 64 77 131 177 110 102 133 37 76 110 165 95 51 83 110 79 83 129 186 84 62 192 44 100 94 166 68 215 26 81 142 128 154 72 86 55 158 205 93 118 148 89 98 138 106 126 94 159 201 116 72 142 116 48 97 140 210 28 206 95 88 67 60 152 41 89 121 206 83 176
RP11.34P13.16 83 117 36 206 96 107 139 10 32 34 75 122 38 54 94 72 63 50 199 71 94 103 33 113 41 64 68 68 36 39 76 23 135 170 79 33 89 65 72 104 130 88 95 9 72 71 78 111 66 32 65 86 58 93 108 79 65 44 24 33 151 65 43 58 70 126 126 179 75 83 122 68 33 85 101 98 49 71 108 41 88 51 102 49 168 77 64 37 24 84 72 114 53 48 84 45 142 43 50 198 67 56 67 67 126 28 133 106 51 18 17 133 96 207 131 54 114 73 43 116 107 156 75 62 78 37 52 101 118 225 153 28 92 90 69 94 42 7 20 92 23 60 145 133 31 69 60 20 54 86 43 207 46 75 92 67 22 51 145 60 121 85 26 85 52 65 66 25 26 108 44 35 64 42 52 95 37 53 168 73 108 41 59 186 40 61 52 38 32 66 82 57 30 33 74 191 66 29 105 69 74 89 57 102 60 6 72 90 170 64 74 184 101 33 111 68 54 86 61 126 102 40 69 32 26 36 76 60 99 46 65 71 16 68 17 135 89 117 87 91 75 37 53 93 107 57 86 77 76 29 141 134 21 57 140 59 93 111 70 51 43 111 76 64 73 32 180 72 135 113 93 21 46 98 131 122 13 48 44 74 82 42 251 112 38 64 57 68 15 52 49 161 97 77 75 45 23 74 53 79 17 158 39 30 192 41 62 86 41 29 79 100 101 130 57 61 65 108 74 186 48 60 14 104 100 115 111 42 109 18 79 55 28 25 90 76 226 42 46 90 132 52 42 59 54 43 16 134 47 14 231 142 80 164 69 86 87 61 41 24 4 91 36 151 57 83 46 107 49 45 57 55 33 120 85 51 19 85 33 21 231 63 109 64 87 95 41 79 0 66 80 53 43 21 135 139 59 35 28 51 85 37 94 116 108 78 20 20 48 25 50 109 64 18 54 91 57 33 72 24 82 136 35 94 185 218 8 74 22 7 44 111 57 149 59 188 53 154 20 70 74 44 48 121 44 144 73 131 89 50 113 17 87 75 151 51 36 185 77 47 61 116 35 188 53 63 94 68 175 25 172 38 79 31 121 52 115 118 32 113 78 56 44 42 151 91 84 88 206 141 176 66 123 78 51 87 53 125 117 79 74 68 154 48 59 53 54 139 151 99 94 68 98 78 50 154 67 46 109 37 88 31 28 133 33 59 16 92 104 105 73 46 115 32 22 107 72 71 33 23 114 90 24 61 115 85 159 68 97 33 39 176 103 101 83 59 94 105 31 53 94 74 53 64 22 74 62 92 56 59 150 146 82 111 60 64 77 58 76 156 45 104 63 24 84 87 20 103 114 66 18 43 77 130 62 53 55 23 62 56 118 43 39 42 51 36 53 74 117 40 50 99 31 58 45 120 42 154 32 43 83 60 57 23 62 11 111 97 73 44 87 65 39 63 68 57 70 90 105 74 38 66 93 24 49 119 139 10 141 59 66 26 29 24 29 86 78 163 33 134
RP11.34P13.13 65 33 42 88 53 94 122 21 90 15 49 31 55 107 123 89 39 89 49 32 38 43 61 47 20 84 14 130 31 14 44 24 84 65 47 25 76 37 26 33 63 104 83 44 34 22 64 44 41 23 82 100 34 35 55 32 68 33 17 24 17 47 48 57 65 11 34 52 57 19 70 21 14 85 32 106 39 90 69 27 102 8 57 52 103 29 28 71 66 70 29 82 24 137 33 68 86 48 68 94 22 60 81 86 66 15 80 38 35 28 25 59 40 50 73 64 95 26 78 74 99 63 61 33 27 23 35 29 22 54 65 55 36 37 66 72 84 29 32 27 23 71 94 51 42 8 75 18 21 11 79 84 92 53 84 105 58 53 50 56 44 13 17 33 45 24 76 61 33 53 88 35 55 19 14 36 29 62 132 25 60 73 69 59 33 32 106 22 48 50 70 19 56 57 86 41 22 26 62 48 29 70 39 97 39 11 111 112 38 47 20 149 23 21 48 50 36 56 16 130 38 45 13 41 90 65 48 41 79 60 56 41 48 189 42 67 53 103 45 34 25 44 43 75 63 18 74 63 129 41 53 96 131 30 40 37 38 29 112 56 42 68 56 119 86 34 97 48 42 91 24 24 75 71 59 48 76 128 19 13 52 46 121 47 38 65 41 46 11 20 96 69 43 53 47 29 23 176 20 34 5 42 31 26 29 32 65 67 26 62 118 43 35 49 26 130 22 105 29 95 22 48 47 78 61 65 56 29 44 52 20 49 43 27 44 14 74 35 82 32 90 28 17 12 57 36 69 84 20 20 97 23 44 68 128 20 36 72 33 20 23 111 69 77 73 106 64 49 52 118 45 46 66 28 13 63 25 16 32 26 48 23 48 35 185 59 158 51 34 49 44 48 76 43 18 74 73 24 72 58 63 16 28 52 30 9 3 41 56 79 25 57 42 26 12 55 52 34 116 20 92 51 44 39 43 147 7 46 52 71 69 187 90 109 18 92 9 73 26 18 39 85 41 38 56 62 73 63 90 101 22 54 20 79 37 67 24 138 142 58 45 160 38 95 79 64 119 47 51 147 138 48 61 34 41 26 20 50 75 13 106 22 40 27 34 41 29 66 135 36 24 27 28 76 65 109 58 106 72 77 75 47 61 33 144 80 67 57 139 50 101 25 42 85 70 100 48 136 21 89 25 39 17 73 25 31 74 25 24 24 13 62 57 60 41 50 62 54 42 37 102 63 57 36 18 112 30 30 6 44 100 50 48 68 50 84 114 67 60 34 54 69 111 23 17 28 33 30 42 29 30 76 84 25 44 8 32 0 56 101 47 92 46 36 142 78 31 49 46 23 15 74 43 49 63 26 90 55 34 28 48 37 20 60 64 96 36 128 61 84 39 98 50 62 72 63 34 33 101 48 24 44 60 5 112 87 70 43 101 28 64 101 52 32 75 34 92 71 45 45 36 55 26 97 21 28 42 64 79 40 154 17 27 12 39 32 56 56 21 75
RP11.34P13.18 148 95 140 87 32 97 93 15 133 130 107 108 178 85 100 193 110 135 121 106 131 31 11 168 168 86 84 121 33 38 146 173 97 64 135 60 161 166 86 184 159 99 116 74 163 110 140 109 85 135 146 175 154 66 173 167 136 110 173 87 56 82 155 57 119 122 148 53 98 163 172 119 130 145 134 94 135 121 106 117 30 154 123 142 101 132 169 162 110 127 125 128 57 111 188 42 229 201 142 164 94 73 156 38 158 121 132 174 117 150 174 140 116 179 112 76 163 92 109 122 166 38 110 154 134 35 117 138 127 77 67 70 94 146 132 73 59 165 43 71 146 58 87 137 139 129 130 108 129 120 132 159 147 87 204 114 92 171 125 164 97 62 139 38 74 92 78 50 93 131 129 130 152 147 191 94 136 99 174 161 161 100 136 131 49 145 71 144 159 156 100 156 18 124 147 149 124 83 137 175 80 136 65 58 81 139 75 196 125 181 90 58 180 124 58 128 102 136 122 56 90 140 116 136 143 58 76 158 51 131 140 105 113 153 87 126 141 89 52 74 102 133 31 95 112 135 137 121 102 179 149 157 160 153 102 84 39 63 140 101 112 172 153 72 142 92 127 107 93 165 134 150 21 158 105 81 159 141 62 205 159 104 192 125 105 77 77 155 133 85 143 103 157 116 136 139 107 125 123 133 158 123 78 58 195 75 94 82 128 106 129 146 162 93 147 110 101 94 102 117 138 144 46 209 90 120 118 74 90 110 81 152 114 136 55 134 159 28 156 3 94 100 55 113 68 112 160 146 178 158 140 172 39 110 97 103 74 133 87 109 76 92 114 97 160 68 171 121 177 157 114 88 63 165 87 203 148 176 121 117 67 70 188 117 112 98 68 104 188 104 186 116 85 151 190 132 147 141 146 177 90 92 125 96 147 216 38 36 182 116 55 75 63 113 124 103 84 158 129 44 100 101 136 137 101 76 72 182 38 121 79 108 95 98 194 110 91 145 55 82 50 96 93 57 107 129 136 24 118 122 115 182 124 97 128 25 169 101 195 164 177 89 132 153 155 149 124 213 74 111 92 136 86 79 163 133 154 130 36 105 66 95 58 73 160 78 140 162 129 135 109 93 152 55 50 60 90 158 161 175 155 240 114 88 65 163 177 136 142 133 58 145 137 141 136 91 60 88 144 93 83 68 163 101 96 108 144 91 33 131 27 87 123 125 106 138 128 143 99 43 136 123 79 192 140 66 97 79 85 170 76 135 93 46 189 115 76 32 124 230 155 110 185 130 137 93 188 108 132 121 143 84 198 183 103 151 193 67 158 160 123 195 114 181 137 124 137 166 167 169 101 131 79 128 156 82 135 112 87 107 104 81 116 127 115 114 98 148 130 168 53 169 69 58 135 156 141 68 169 0 73 86 110 109 57 123 60 127 160 144 102 132 120 64 176 65 82 90 76 157 139 122 130 79 81 98 90 69 138 107 140 176 76 174 134 99 143 146 150 77
AP006222.2 96 142 150 23 122 88 50 47 41 217 34 138 98 168 275 80 178 138 286 208 120 185 180 73 158 107 215 216 103 161 135 80 112 134 164 233 59 168 69 135 159 162 127 180 116 125 159 271 32 267 6 97 175 49 90 167 184 83 134 126 236 95 78 146 190 158 206 159 125 101 57 160 112 64 70 130 82 74 133 187 9 74 187 54 104 180 106 28 96 70 101 90 164 95 104 116 128 193 41 190 257 112 95 103 214 82 80 93 235 124 199 167 222 357 96 229 71 180 104 171 0 51 110 219 100 87 111 278 245 233 37 188 216 59 54 90 137 106 195 94 23 116 42 155 193 159 179 104 148 100 35 131 29 207 267 155 129 101 257 37 157 197 45 79 119 57 138 70 94 188 125 77 190 172 159 170 136 69 145 214 130 34 173 200 112 278 75 70 122 225 30 164 23 162 214 310 92 122 207 146 268 115 148 144 175 94 76 96 287 236 238 248 325 105 127 137 207 62 135 143 146 120 103 235 13 61 178 89 67 8 160 104 54 103 132 141 167 69 82 148 166 151 111 95 191 67 90 62 11 138 349 82 23 148 112 201 99 38 201 246 7 160 112 6 134 71 201 123 260 22 164 127 118 157 91 201 103 174 189 127 133 71 109 102 263 106 70 79 149 193 87 160 248 187 106 171 117 15 128 123 47 63 53 42 73 9 210 233 109 161 60 104 95 152 68 272 69 245 94 30 206 227 183 243 151 65 98 68 159 133 19 158 32 159 278 206 141 186 271 210 140 145 126 90 55 123 11 82 120 172 132 71 107 37 205 59 162 74 274 170 30 32 106 13 94 153 172 185 98 92 155 70 5 236 118 294 105 181 27 73 200 50 164 263 16 56 97 88 137 101 126 168 14 112 183 207 183 36 106 188 120 76 92 121 155 68 166 154 243 51 107 92 191 50 54 148 159 117 21 174 195 299 118 241 151 270 112 108 79 57 80 16 69 95 23 192 213 150 104 190 119 165 176 190 214 139 125 190 37 166 145 159 86 102 165 97 57 152 83 40 57 136 171 86 40 171 23 142 6 208 108 108 37 50 85 152 94 80 81 121 55 53 99 125 281 245 71 154 40 193 170 112 13 139 93 155 117 99 152 261 96 201 181 61 110 88 12 217 107 246 117 265 206 32 99 236 142 136 59 206 120 106 181 109 191 144 194 152 238 167 35 98 194 1 225 202 133 192 234 89 82 23 68 96 118 235 157 174 125 176 24 193 184 95 167 115 17 39 96 86 129 88 45 246 69 198 152 18 198 105 86 193 122 108 76 153 254 76 238 53 242 35 146 120 9 91 261 191 91 160 67 76 149 93 156 153 95 103 143 103 19 304 190 130 110 136 79 127 78 50 88 25 217 233 61 110 197 131 167 65 136 52 23 54 100 146 117 170 224 124 105 107 175 133 106 190 90 239 139 185 92 260 138 53 99 229 98 196 91 118 198 133 204 80 121 60 89 192 53 236
Code
dim(RNA_counts_adjusted)
[1] 18749   660
Code
head(covariates_original)
SUBJID SEX AGE_DECADE HARDY_SCALE RNA_INTEGRITY_NUMBER ISCHEMIC_TIME PATHOLOGY_CATEGORIES PATHOLOGY_NOTES HISTO_DI RNA_DI
GTEX-1117F 2 60-69 4 6.8 1214 2 pieces, ~15% vessel stroma, rep delineated -0.410 0.075
GTEX-111CU 1 50-59 0 7.5 138 2 pieces, small portion of nerve (<10% of one piece) -0.510 0.086
GTEX-111FC 1 60-69 1 7.3 1040 2 pieces, larger piece is 30% fibrovascular tissue and nerve, smaller piece ~10% 0.787 0.834
GTEX-111VG 1 60-69 3 7.7 1083 2 pieces -0.296 -0.207
GTEX-111YS 1 60-69 0 6.6 296 3 small irregular pieces -0.489 -0.174
GTEX-1122O 2 60-69 0 6.3 126 2 pieces 1.030 0.726
Code
str(covariates_original)
'data.frame':   660 obs. of  10 variables:
 $ SUBJID              : chr  "GTEX-1117F" "GTEX-111CU" "GTEX-111FC" "GTEX-111VG" ...
 $ SEX                 : int  2 1 1 1 1 2 2 1 1 1 ...
 $ AGE_DECADE          : chr  "60-69" "50-59" "60-69" "60-69" ...
 $ HARDY_SCALE         : int  4 0 1 3 0 0 2 NA 0 2 ...
 $ RNA_INTEGRITY_NUMBER: num  6.8 7.5 7.3 7.7 6.6 6.3 6.5 6.7 7.4 7.3 ...
 $ ISCHEMIC_TIME       : int  1214 138 1040 1083 296 126 882 96 144 1092 ...
 $ PATHOLOGY_CATEGORIES: chr  "" "" "" "" ...
 $ PATHOLOGY_NOTES     : chr  "2 pieces, ~15% vessel stroma, rep delineated" "2 pieces, small portion of nerve (<10% of one piece)" "2 pieces, larger piece is 30% fibrovascular tissue and nerve, smaller piece ~10%" "2 pieces" ...
 $ HISTO_DI            : num  -0.41 -0.51 0.787 -0.296 -0.489 ...
 $ RNA_DI              : num  0.0747 0.0859 0.8343 -0.2069 -0.1737 ...

There are 12 samples with missing value of the hardy scale. What to do with it?

Code
is.na(covariates_original) |> sum()
[1] 12
Code
is.na(covariates_original$HARDY_SCALE) |> sum()
[1] 12

There are counts for 18749 genes measured in 660 samples.

We start by creating a copy of the covariates data frame that we can modify:

Code
covariates <- covariates_original

Replace the “.” in column of so RNA_counts_adjusted so that they match the SUBJIDfrom covariates.

Code
colnames(RNA_counts_adjusted) <- gsub(".", "-", colnames(RNA_counts_adjusted), fixed = TRUE)

Replace SEX column by “M” for male, and “F” for female

Code
covariates$SEX[covariates$SEX == "1"] <- "M"
covariates$SEX[covariates$SEX == "2"] <- "F"

Replace the blank PATHOLOGY_CATEGORIES by something more explicit:

Code
covariates$PATHOLOGY_CATEGORIES[covariates$PATHOLOGY_CATEGORIES == ""] <- "unspecified"

AGE_DECADE: replace by a number instead? –> ex: 60-69 -> we should probably replace by 65 (if we assume that the age in that range is normally distributed)

Code
age_continuous <- substr(covariates$AGE_DECADE, start = 1, stop = 1) |> paste0("5") |> as.numeric()

covariates <- covariates |> 
  mutate(AGE_CONTINUOUS = age_continuous)

Replace the correct variables by factors:

Code
covariates <- covariates |> 
  mutate(SEX = factor(SEX),
         HARDY_SCALE = factor(HARDY_SCALE),
         PATHOLOGY_CATEGORIES = factor(PATHOLOGY_CATEGORIES)
         )

covariates |>  str()
'data.frame':   660 obs. of  11 variables:
 $ SUBJID              : chr  "GTEX-1117F" "GTEX-111CU" "GTEX-111FC" "GTEX-111VG" ...
 $ SEX                 : Factor w/ 2 levels "F","M": 1 2 2 2 2 1 1 2 2 2 ...
 $ AGE_DECADE          : chr  "60-69" "50-59" "60-69" "60-69" ...
 $ HARDY_SCALE         : Factor w/ 5 levels "0","1","2","3",..: 5 1 2 4 1 1 3 NA 1 3 ...
 $ RNA_INTEGRITY_NUMBER: num  6.8 7.5 7.3 7.7 6.6 6.3 6.5 6.7 7.4 7.3 ...
 $ ISCHEMIC_TIME       : int  1214 138 1040 1083 296 126 882 96 144 1092 ...
 $ PATHOLOGY_CATEGORIES: Factor w/ 9 levels "calcification",..: 9 9 9 9 9 9 9 9 9 9 ...
 $ PATHOLOGY_NOTES     : chr  "2 pieces, ~15% vessel stroma, rep delineated" "2 pieces, small portion of nerve (<10% of one piece)" "2 pieces, larger piece is 30% fibrovascular tissue and nerve, smaller piece ~10%" "2 pieces" ...
 $ HISTO_DI            : num  -0.41 -0.51 0.787 -0.296 -0.489 ...
 $ RNA_DI              : num  0.0747 0.0859 0.8343 -0.2069 -0.1737 ...
 $ AGE_CONTINUOUS      : num  65 55 65 65 65 65 65 65 55 45 ...

2 Q1: Descriptive analysis of the clinical and technical variables, and of the dimoprphism indices

Distribution: - Histograms of the variables

Code
colnames(covariates)
 [1] "SUBJID"               "SEX"                  "AGE_DECADE"          
 [4] "HARDY_SCALE"          "RNA_INTEGRITY_NUMBER" "ISCHEMIC_TIME"       
 [7] "PATHOLOGY_CATEGORIES" "PATHOLOGY_NOTES"      "HISTO_DI"            
[10] "RNA_DI"               "AGE_CONTINUOUS"      
Code
plot_sex <- ggplot(covariates, aes(x = SEX,fill = SEX)) +
  geom_bar() +
  xlab("Sex")

plot_age_decade <- ggplot(covariates, aes(x = AGE_DECADE, fill = SEX)) +
  geom_bar(position = "dodge") +
  xlab("Age decade")

plot_hardy_scale <- ggplot(covariates, aes(x = HARDY_SCALE, fill = SEX)) +
  geom_bar(position = "dodge") +
  xlab("Hardy scale")

plot_rni <- ggplot(covariates, aes(x = RNA_INTEGRITY_NUMBER, fill = SEX)) +
  geom_histogram(bins = 20, position = "dodge") +
  xlab("RNA Integrity Number")

plot_ischemic_time <- ggplot(covariates, aes(x = ISCHEMIC_TIME, fill = SEX)) +
  geom_histogram(bins = 30, position = "dodge") +
  xlab("Ischemic time")

plot_pathology_categories <- ggplot(covariates, aes(x = PATHOLOGY_CATEGORIES, fill = SEX)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 10)) +
  xlab("Pathology categories")

plot_histo_di <- ggplot(covariates, aes(x = HISTO_DI, fill = SEX)) +
  geom_histogram(position = "dodge", bins = 30) +
  xlab("Dimorphism indices from histology profiles")

plot_rna_di <- ggplot(covariates, aes(x = RNA_DI, fill = SEX)) +
  geom_histogram(bins = 30, position = "dodge") +
  xlab("Dimorphism indices from RNA-seq transcriptomes")


(plot_sex + plot_age_decade + plot_hardy_scale) / 
  (plot_rni + plot_ischemic_time) +
  (free(plot_pathology_categories) + (plot_histo_di / plot_rna_di)) +
  plot_annotation(tag_levels = "a") +
  plot_layout(guides = "collect")
Figure 1: distribution of the covariates

Correlation: - Correlation of the variables with one another: for correlation between two categorical variables, the Chi-square test of independence is commonly used - For continuous variables: pairs plot + calculate Pearson correlation. - ANOVA for link between categorical and continuous variables??

Code
covariates |>
  select(AGE_CONTINUOUS, RNA_INTEGRITY_NUMBER, ISCHEMIC_TIME, HISTO_DI, RNA_DI) |> 
  ggpairs(aes(col = covariates$SEX, alpha = 0.5), progress = FALSE)

Confounding variables (confounding for dimorphism indices and clinical variables): - Look at RNA integrity and ischemic time - do a PCA on the counts and color the points by a technical factor: see if we observe a gradient. example for detecting batch effect

To estimate the size factors and the vst (variance stabilizing transformation) with DESeq2: every gene contains at least one zero, cannot compute log geometric means. Solution: add a pseudo-count of 1 to every entry:

Code
mt <- match(colnames(RNA_counts_adjusted), covariates$SUBJID)

RNA_counts_ds <- DESeqDataSetFromMatrix(
  countData = RNA_counts_adjusted + 1,
  colData = covariates[mt, ],
  design = ~ 1 # no design yet, as we have to determine the confounding factors
)
Code
vst_counts <- vst(RNA_counts_ds)

plotPCA(vst_counts, intgroup = c("SEX"), ntop = 500) +
  coord_fixed() +
  aes(alpha = 0.8) +
  scale_color_discrete(name = "Sex")

Code
plotPCA(vst_counts, intgroup = c("SEX"), ntop = 200) +
  coord_fixed() +
  geom_point(alpha = 0.5) +
  scale_color_discrete(name = "Sex")

Code
PCA_rnadi_M <- vst_counts[,colData(vst_counts)$SEX == "M"] |> 
  plotPCA(intgroup = c("RNA_DI"), ntop = 500) +
  scale_color_continuous(type = "viridis", name = "RNA DI") +
  coord_fixed() +
  ggtitle("Males")

PCA_rnadi_F <- vst_counts[,colData(vst_counts)$SEX == "F"] |> 
  plotPCA(intgroup = c("RNA_DI"), ntop = 500) +
  scale_color_continuous(type = "viridis", name = "RNA DI") +
  coord_fixed() +
  ggtitle("Females")

PCA_rnadi_M + PCA_rnadi_F + plot_annotation(tag_levels = "a") + 
  plot_layout(guides = "collect")  & theme(legend.position = 'bottom')
Figure 2: PCA individuals map of the samples colored by their dimorphism index. (a) Male samples (b) Female samples.
Code
PCA_histodi_M <- vst_counts[,colData(vst_counts)$SEX == "M"] |> 
  plotPCA(intgroup = c("HISTO_DI"), ntop = 500) +
  scale_color_continuous(type = "viridis", name = "HISTO DI") +
  coord_fixed() +
  ggtitle("Males")

PCA_histodi_F <- vst_counts[,colData(vst_counts)$SEX == "F"] |> 
  plotPCA(intgroup = c("HISTO_DI"), ntop = 500) +
  scale_color_continuous(type = "viridis", name = "HISTO DI") +
  coord_fixed() +
  ggtitle("Females")

PCA_histodi_M + PCA_histodi_F + plot_annotation(tag_levels = "a") + 
  plot_layout(guides = "collect")  & theme(legend.position = 'bottom')
Figure 3: PCA individuals map of the samples colored by their histological dimorphism index. (a) Male samples (b) Female samples.
Code
plotPCA(vst_counts, intgroup = c("AGE_CONTINUOUS"), ntop = 500) +
  scale_color_continuous(type = "viridis", name = "Age (continuous)") +
  coord_fixed()

Code
PCA_age_M <- plotPCA(vst_counts[,colData(vst_counts)$SEX == "M"], intgroup = c("AGE_CONTINUOUS"), ntop = 500, pcsToUse = 1:2) +
  scale_color_continuous(type = "viridis", name = "Age (continuous)") +
  coord_fixed() +
  ggtitle("Males")

PCA_age_F <- plotPCA(vst_counts[,colData(vst_counts)$SEX == "F"], intgroup = c("AGE_CONTINUOUS"), ntop = 500, pcsToUse = 1:2) +
  scale_color_continuous(type = "viridis", name = "Age (continuous)") +
  coord_fixed() +
  ggtitle("Females")

PCA_age_M + PCA_age_F + plot_annotation(tag_levels = "a") + 
  plot_layout(guides = "collect") & theme(legend.position = 'bottom')

Women of age below 40 seem to cluster more to the right of the plot –> possible confounding factor.

Code
plotPCA(vst_counts, intgroup = c("RNA_INTEGRITY_NUMBER"), ntop = 500) +
  scale_color_continuous(type = "viridis", name = "RNA integrity number") +
  coord_fixed()

Code
 plotPCA(vst_counts[,colData(vst_counts)$SEX == "M"], intgroup = c("RNA_INTEGRITY_NUMBER"), ntop = 500, pcsToUse = 1:2) +
  scale_color_continuous(type = "viridis", name = "RIN") +
  coord_fixed()

Code
  plotPCA(vst_counts[,colData(vst_counts)$SEX == "F"], intgroup = c("RNA_INTEGRITY_NUMBER"), ntop = 500, pcsToUse = 1:2) +
  scale_color_continuous(type = "viridis", name = "RIN") +
  coord_fixed()

If we take a lot of top genes (selected by row variance) into account for the PCA, the samples seem to form clusters based on the Hardy scale value.

Code
plotPCA(vst_counts, intgroup = c("HARDY_SCALE"), ntop = 10000) +
  coord_fixed() +
  aes(alpha = 0.5) +
  scale_color_discrete(name = "Hardy scale")

Code
plotPCA(vst_counts[,colData(vst_counts)$PATHOLOGY_CATEGORIES != "unspecified"], intgroup = c("PATHOLOGY_CATEGORIES"), ntop = 500, pcsToUse = 1:2) +
  coord_fixed() +
  aes(alpha = 0.8) +
  scale_color_discrete(name = "Pathology categories") +
  ggtitle("PCA - individuals' map with samples colored by pathology category", subtitle = "Unspecified pathology categories are removed")

Code
plotPCA(vst_counts, intgroup = c("PATHOLOGY_CATEGORIES"), ntop = 500, pcsToUse = 1:2) +
  coord_fixed() +
  aes(alpha = 0.8) +
  scale_color_discrete(name = "Pathology categories")

Code
library("cowplot")

pcaData <- plotPCA(vst_counts, intgroup = c("HARDY_SCALE"), ntop = 10000, returnData = TRUE)

percentVar <- round(100 * attr(pcaData, "percentVar"))



# Create the PCA plot
p <- ggplot(pcaData, aes(PC1, PC2, color = HARDY_SCALE)) + 
  geom_point(size = 3, alpha = 0.5) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme_minimal()

# Create density plots for the x and y axes
xdens <- axis_canvas(p, axis = "x") +
  geom_histogram(data = pcaData, aes(x = PC1, fill = HARDY_SCALE), alpha = 0.7, position = "identity")

ydens <- axis_canvas(p, axis = "y", coord_flip = TRUE) +
  geom_histogram(data = pcaData, aes(x = PC2, fill = HARDY_SCALE), alpha = 0.7, position = "identity") +
  coord_flip()

# Combine the PCA plot with the density plots
p_combined <- insert_xaxis_grob(p, xdens, grid::unit(1, "in"), position = "top") %>%
  insert_yaxis_grob(ydens, grid::unit(1, "in"), position = "right") %>%
  ggdraw()

# Display the combined plot
print(p_combined)

Code
pcaData <- plotPCA(vst_counts[,colData(vst_counts)$SEX == "F"], intgroup = c("AGE_DECADE"), ntop = 500, returnData = TRUE)

percentVar <- round(100 * attr(pcaData, "percentVar"))

# Create the PCA plot
p <- ggplot(pcaData, aes(PC1, PC2, color = AGE_DECADE)) + 
  geom_point(size = 3) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  theme_minimal()

# Create density plots for the x and y axes
xdens <- axis_canvas(p, axis = "x") +
  geom_density(data = pcaData, aes(x = PC1, fill = AGE_DECADE), alpha = 0.5)

ydens <- axis_canvas(p, axis = "y", coord_flip = TRUE) +
  geom_density(data = pcaData, aes(x = PC2, fill = AGE_DECADE), alpha = 0.5) +
  coord_flip()

# Combine the PCA plot with the density plots
p_combined <- insert_xaxis_grob(p, xdens, grid::unit(1, "in"), position = "top") %>%
  insert_yaxis_grob(ydens, grid::unit(1, "in"), position = "right") %>%
  ggdraw()

# Display the combined plot
print(p_combined)
Figure 4: Individual’s plot of the PCA performed on the female samples only. The samples are colored by their age at death (in decades).

Conclusion: technical variables that will be taken into account: SEX, RNA_DI, AGE_CONTINUOUS, HARDY_SCALE. Contrasts that were are most interested in: “SEX_M_vs_”F (differentially expressed genes in males vs. females), “SEXF.RNA_DI” (to identify the genes associated with the female dimorphism index), and “SEXM.RNA_DI” (to find the genes associated with the male dimorphism index). In DESeq2, this corresponds to the design: ~ HARDY_SCALE + AGE_CONTINUOUS + SEX + SEX:RNA_DI (see next section).

3 Q2: the scale of DI-associated gene expression

3.1 Filtering the counts

  • Filter the genes with counts (too low and) too low variance:

Wikipedia: “In statistics, the median absolute deviation (MAD) is a robust measure of the variability of a univariate sample of quantitative data.” “For a univariate data set \(X_1, X_2, ..., X_n\), the MAD is defined as the median of the absolute deviations from the data’s median \(\tilde{X} = \text{median}(X)\):

\[\text{MAD} = \text{median}(|X_i - \tilde{X}|)\]

We will only keep the 50% most variant genes (50% of genes with the highest median absolute deviation):

Code
filtered_counts <- RNA_counts_adjusted |>
  rownames_to_column("gene") |> 
  as_tibble() |> 
  rowwise() |> 
  mutate(mad= c_across(2:ncol(RNA_counts_adjusted)) |> mad()) |> 
  ungroup() |> 
  slice_max(n = round(nrow(RNA_counts_adjusted) / 2), order_by = mad) |> 
  select(- mad) |> 
  as.data.frame() |> 
  column_to_rownames("gene")

The variables in the design formula cannot contain missing values (NAs). However, the HARDY_SCALE variable does contain missing values:

Code
cat("Missing HARDY_SCALE values:", is.na(covariates$HARDY_SCALE) |> sum())
Missing HARDY_SCALE values: 12

For simplicity, and since the number of samples with missing values is relatively small compared to the total number of samples (660 samples; The missing values thus represent 1.8181818%), we will simply discard the samples with a missing HARDY_SCALE value. Other, more sophisticated methods could have been used, such as replacing the missing values by the most common one (HARDY_SCALE = 0 with our data set), or creating a classifier model to predict the missing values using the other covariates (source?). We could also have replaced the “NA” value by a new category called “unknown”. However, since there are only 12 samples in that category, and there quite a lot of variables in the design, we would risk having a low statistical power (very few degrees of freedom).

Code
non_na_samples <- !is.na(covariates$HARDY_SCALE)

non_na_covariates <- covariates[non_na_samples,]

filtered_counts <- filtered_counts[, non_na_samples]

3.2 Differential gene expression analysis

Code
alpha <- 0.05

As recommended by DESeq2, we will center and scale the continuous variables (AGE_CONTINUOUS, and RNA_DI) to improve GLM convergence.

Code
non_na_covariates$AGE_CONTINUOUS <- scale(non_na_covariates$AGE_CONTINUOUS, center = TRUE, scale = TRUE)

non_na_covariates$RNA_DI <- scale(non_na_covariates$RNA_DI, center = TRUE, scale = TRUE)

Creating the DESeq2 object with the filtered counts:

Code
mt <- match(colnames(filtered_counts), non_na_covariates$SUBJID)

filtered_counts_ds <- DESeqDataSetFromMatrix(
  countData = filtered_counts,
  colData = non_na_covariates[mt, ],
  design = ~ HARDY_SCALE + AGE_CONTINUOUS + SEX + SEX:RNA_DI # Design decided in Q1
)
Code
filtered_counts_ds <- DESeq(filtered_counts_ds)

resultsNames(filtered_counts_ds)
[1] "Intercept"          "HARDY_SCALE_1_vs_0" "HARDY_SCALE_2_vs_0"
[4] "HARDY_SCALE_3_vs_0" "HARDY_SCALE_4_vs_0" "AGE_CONTINUOUS"    
[7] "SEX_M_vs_F"         "SEXF.RNA_DI"        "SEXM.RNA_DI"       

3.2.1 Genes associated with the female and male dimorphism indices

Code
# Genes associated with the female dimorphism index

rna_di_F_res <- results(filtered_counts_ds, name = "SEXF.RNA_DI", alpha = alpha)

NotSig_F <- sum(rna_di_F_res$padj >= alpha, na.rm = TRUE)
Down_F <- sum(rna_di_F_res$padj <= alpha & rna_di_F_res$log2FoldChange < 0, na.rm = TRUE)
Up_F <- sum(rna_di_F_res$padj <= alpha & rna_di_F_res$log2FoldChange >= 0, na.rm = TRUE)
Code
# Genes associated with the male dimorphism index

rna_di_M_res <- results(filtered_counts_ds, name = "SEXM.RNA_DI", alpha = alpha)

NotSig_M <- sum(rna_di_M_res$padj >= alpha, na.rm = TRUE)
Down_M <- sum(rna_di_M_res$padj <= alpha & rna_di_M_res$log2FoldChange < 0, na.rm = TRUE)
Up_M <- sum(rna_di_M_res$padj <= alpha & rna_di_M_res$log2FoldChange >= 0, na.rm = TRUE)
Code
rna_di_res_df <- data.frame("Female RNA DI" = c(Down_F, NotSig_F, Up_F),
                           "Male RNA DI" = c(Down_M, NotSig_M, Up_M))

rownames(rna_di_res_df) <- c("Down", "NotSig", "Up")

rna_di_res_df |> gt(rownames_to_stub = TRUE)
Female.RNA.DI Male.RNA.DI
Down 5536 2307
NotSig 2921 2209
Up 1350 5291

Volcano plots

Code
volcano_plot <- function(data, title){
  alpha <- 0.05
  
  data <- data |> 
    as.data.frame() |> 
    rownames_to_column("gene_id")
    
  
  data <- data |> filter(!is.na(pvalue))
  
  data <- data |> 
    arrange(padj) |>
    mutate(rank = row_number()) 
  
  # all genes are set to non-significant by default
  data$category <- "NS"
  
  # Significant genes + up/down-regulated
  data$category[data$padj < alpha & data$log2FoldChange < 0] <- "Down"
  data$category[data$padj < alpha & data$log2FoldChange > 0] <- "Up"
  
  data$category <- factor(data$category)
  
  counts <- c(
    "Down" = sum(data$category == "Down"),
    "NS" = sum(data$category == "NS"),
    "Up" = sum(data$category == "Up")) |> 
    as.character()
  
  # Plot colors
  plot_cols <- c("Down" =  hue_pal()(2)[1],
                 "NS" = "grey",
                 "Up" =  hue_pal()(2)[2])
  
  # Creating the volcano plot
  plot <- data |> ggplot(aes(x = log2FoldChange, y = -log10(pvalue), color = category)) +
    geom_point(alpha = 0.7) +
    geom_vline(xintercept = c(-1, 0, 1), col = "gray", linetype = "dashed") +
    # Name added to the plot for the 20 genes with lowest adjusted p-values
    ggrepel::geom_text_repel(
      data = data |> filter(rank <= 20, padj < rank),
      aes(label = gene_id),
      size = 3,
      show.legend = FALSE,
      max.overlaps = getOption("ggrepel.max.overlaps", default = 30)) + 
    scale_color_manual(values = plot_cols, labels = counts, name = "") +
    labs(title = title,
         x = expression("log"[2]*"Fold Change"),  
         y = expression("-log"[10]*"(p-value)")) +
    
    geom_hline(yintercept = -log10(alpha), linetype = "dashed", col = "grey") +
    theme_light() +
    theme(legend.position = "bottom")
    
    
    return(plot)
}

volcano_plot(rna_di_F_res, "Female RNA Dimorphism index") +
volcano_plot(rna_di_M_res, "Male RNA Dimorphism index")

(Interpretation of the log2 FC: If the variable of interest is continuous-valued, then the reported log2 fold change is per unit of change of that variable.)

Code
# Up- and down-regulated genes according to dimorphism index in females
set_up_F <- rna_di_F_res |> as.data.frame() |> filter(padj < alpha & log2FoldChange > 0) |> rownames()
set_down_F <- rna_di_F_res |> as.data.frame() |> filter(padj < alpha & log2FoldChange < 0) |> rownames()

set_di_F <- c(set_up_F, set_down_F)

# Up- and down-regulated genes according to dimorphism index in males
set_up_M <- rna_di_M_res |> as.data.frame() |> filter(padj < alpha & log2FoldChange > 0) |> rownames()
set_down_M <- rna_di_M_res |> as.data.frame() |> filter(padj < alpha & log2FoldChange < 0) |> rownames()

set_di_M <- c(set_up_M, set_down_M)
Code
venn_cols <- hue_pal()(4)

venn.plot <- venn.diagram(
  x = list(up_F = set_up_F, down_F = set_down_F, up_M = set_up_M, down_M = set_down_M),
  category.names = c("Females Up", "Females Down", "Males Up", "Males Down"),
  filename = NULL,
  fill = venn_cols,
  col = venn_cols,
  fontfamily = "sans",
  # Set names
  cat.cex = 1,
  cat.fontface = "bold",
  cat.default.pos = "outer",
  cat.fontfamily = "sans",
  cat.col = venn_cols,
  height = 480,
  width = 480,
  scaled = TRUE
)

# Plot the Venn diagram
grid.newpage()
grid.draw(venn.plot)

3.2.2 Genes associated with sex

Code
sex_res <- results(filtered_counts_ds, contrast = c("SEX", "M", "F")) # log2 Fold-Change = Male / Female

NotSig_SEX <- sum(sex_res$padj >= alpha, na.rm = TRUE)
Dow_SEX <- sum(sex_res$padj <= alpha & sex_res$log2FoldChange < 0, na.rm = TRUE)
Up_SEX <- sum(sex_res$padj <= alpha & sex_res$log2FoldChange > 0, na.rm = TRUE)

# Showing the results
sex_res_df <- data.frame("SEX_M_vs_F" = c(Dow_SEX, NotSig_SEX, Up_SEX))
rownames(sex_res_df) <- c("Down", "NotSig", "Up")
sex_res_df |> gt(rownames_to_stub = TRUE)
SEX_M_vs_F
Down 1181
NotSig 5832
Up 2794
Code
volcano_plot(sex_res, "Differentially expressed genes according to sex")

3.2.3 Genes associated with age

Code
age_res <- results(filtered_counts_ds, name = "AGE_CONTINUOUS", alpha = alpha)

NotSig_AGE <- sum(age_res$padj >= alpha, na.rm = TRUE)
Dow_AGE <- sum(age_res$padj <= alpha & age_res$log2FoldChange < 0, na.rm = TRUE)
Up_AGE <- sum(age_res$padj <= alpha & age_res$log2FoldChange > 0, na.rm = TRUE)

# Showing the results
age_res_df <- data.frame("AGE_CONTINUOUS" = c(Dow_AGE, NotSig_AGE, Up_AGE))
rownames(age_res_df) <- c("Down", "NotSig", "Up")
age_res_df |> gt(rownames_to_stub = TRUE)
AGE_CONTINUOUS
Down 465
NotSig 8894
Up 448
Code
volcano_plot(age_res, "Differentially expressed genes according to age (continuous)")

Heatmap of the VST transformed counts of the 30 most significant genes (lowest adjusted p-values), with samples annotated with sex, age, as color annotation.

Code
top30_age_genes <- age_res |> as.data.frame() |> 
  slice_min(padj, n = 30) |> 
  rownames()

female_samples <- covariates |> 
  filter(SEX == "F") |> 
  select(SUBJID) |> 
  unlist(use.names = FALSE)

male_samples <- covariates |> 
  filter(SEX == "M") |> 
  select(SUBJID) |> 
  unlist(use.names = FALSE)

# All sexes confounded
assay(vst_counts)[top30_age_genes,] |> 
  pheatmap(scale = "row",
           cluster_cols = TRUE,
           annotation_col = covariates |> column_to_rownames(var ="SUBJID") |>  select("AGE_DECADE", "SEX"))

Code
# Female heatmap
assay(vst_counts)[top30_age_genes, female_samples] |>
  pheatmap(scale = "row",
           cluster_cols = TRUE,
           annotation_col = covariates |> column_to_rownames(var ="SUBJID") |>  select("AGE_DECADE"))

Code
# Male heatmap
assay(vst_counts)[top30_age_genes, male_samples] |>
  pheatmap(scale = "row",
           cluster_cols = TRUE,
           show_colnames = FALSE,
           annotation_col = covariates |> column_to_rownames(var ="SUBJID") |>  select("AGE_DECADE"))

3.2.4 Genes associated with the hardy scale

Hypothèse principale testée en ANOVA 1: tester si toutes les populations sont de même moyenne (H0: mu1 = mu2 = mu3 = … = mu, H1: deux moyennes au moins sont différentes) –> We want to do something similar with the HARDY_SCALE variable. With the other covariates (SEX, RNA_DI and AGE_CONTINUOUS), we used the Wald significance test. The Wald test tests “for significance of coefficients in a Negative Binomial GLM” (H0: beta = 0). However, the Hardy scale has 5 different levels, and testing if genes are differentially expressed between each level (e.g. level 0 vs level 1, level 0 vs level 2, etc.), would require to make 10 (5*(5-1)/2) contrasts. This is called multiple testing and it would thus require corrections of the p-value to control the type 1 error rate.

But, with our biological question, which is to identify genes associated with sexual dimorphism in adipose tissue, knowing which genes are associated with each hardy scale level is not biologically interesting. Therefore, we will use the likelihood ratio test which is “an alternative to pair-wise comparisons” and is used “to analyze all levels of a factor at once” (like in ANOVA). (source: https://hbctraining.github.io/DGE_workshop/lessons/08_DGE_LRT.html)

Code
dds_lrt <- DESeq(filtered_counts_ds, test = "LRT", reduced = ~ AGE_CONTINUOUS + SEX + SEX:RNA_DI)
res_lrt <- results(dds_lrt)
Code
volcano_plot(res_lrt, "Hardy scale")

Code
sig_hardy_genes <- res_lrt |> 
  as.data.frame() |> 
  slice_min(order_by = padj, n = 9) |> 
  rownames()

length(sig_hardy_genes)
[1] 9
Code
fcounts_long <- filtered_counts |> rownames_to_column("gene_id") |> 
  pivot_longer(cols = - gene_id, names_to = "SUBJID", values_to = "counts")

fcounts_long <- left_join(fcounts_long, covariates |> select(SUBJID, HARDY_SCALE), by = "SUBJID")

fcounts_long |> 
  filter(gene_id %in% sig_hardy_genes) |> 
  ggplot(aes(x = HARDY_SCALE, y = counts, fill = HARDY_SCALE)) +
  geom_boxplot(outlier.size = 0.8, outlier.alpha = 0.6) +
  facet_wrap(~ gene_id, scales = "free_y", ncol = 3) +
  labs(x = "Hardy Scale",
       y = "Raw Counts") +
  ggtitle("Top 9 genes with lowest adjusted p-value for the \n 
          likelihood ratio test on HARDY_SCALE covariate")

Question about statistical power: see my project P3 from traitement statistique des données -omiques.

–> Résumé: table of (FDR) adjusted p-values or heatmap (gene vs. factor + color = adjusted p-value)

4 Q3:

  1. GSEA using fgsea package (see my main project traitement statistique des données -omiques) using the reactome gmt file? Or just find genes associated with receptors?

Annex: session information

Code
sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.4

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Brussels
tzcode source: internal

attached base packages:
[1] grid      stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] corrplot_0.95               VennDiagram_1.7.3          
 [3] futile.logger_1.4.3         cowplot_1.1.3              
 [5] viridis_0.6.5               viridisLite_0.4.2          
 [7] GGally_2.2.1                scales_1.3.0               
 [9] pheatmap_1.0.12             DESeq2_1.44.0              
[11] SummarizedExperiment_1.34.0 Biobase_2.64.0             
[13] MatrixGenerics_1.16.0       matrixStats_1.5.0          
[15] GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
[17] IRanges_2.38.1              S4Vectors_0.42.1           
[19] BiocGenerics_0.50.0         patchwork_1.3.0            
[21] ggrepel_0.9.6               gt_0.11.1                  
[23] magrittr_2.0.3              lubridate_1.9.4            
[25] forcats_1.0.0               stringr_1.5.1              
[27] dplyr_1.1.4                 purrr_1.0.4                
[29] readr_2.1.5                 tidyr_1.3.1                
[31] tibble_3.2.1                ggplot2_3.5.1              
[33] tidyverse_2.0.0            

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1        farver_2.1.2            fastmap_1.2.0          
 [4] digest_0.6.37           timechange_0.3.0        lifecycle_1.0.4        
 [7] compiler_4.4.1          rlang_1.1.5             tools_4.4.1            
[10] yaml_2.3.10             lambda.r_1.2.4          knitr_1.50             
[13] labeling_0.4.3          S4Arrays_1.4.1          htmlwidgets_1.6.4      
[16] DelayedArray_0.30.1     plyr_1.8.9              xml2_1.3.8             
[19] RColorBrewer_1.1-3      abind_1.4-8             BiocParallel_1.38.0    
[22] withr_3.0.2             colorspace_2.1-1        cli_3.6.4              
[25] rmarkdown_2.29          crayon_1.5.3            generics_0.1.3         
[28] rstudioapi_0.17.1       httr_1.4.7              tzdb_0.5.0             
[31] zlibbioc_1.50.0         parallel_4.4.1          formatR_1.14           
[34] XVector_0.44.0          vctrs_0.6.5             Matrix_1.7-3           
[37] jsonlite_2.0.0          hms_1.1.3               locfit_1.5-9.12        
[40] glue_1.8.0              ggstats_0.9.0           codetools_0.2-20       
[43] stringi_1.8.7           gtable_0.3.6            UCSC.utils_1.0.0       
[46] munsell_0.5.1           pillar_1.10.1           htmltools_0.5.8.1      
[49] GenomeInfoDbData_1.2.12 R6_2.6.1                evaluate_1.0.3         
[52] lattice_0.22-6          futile.options_1.0.1    Rcpp_1.0.14            
[55] gridExtra_2.3           SparseArray_1.4.8       xfun_0.51              
[58] pkgconfig_2.0.3